In [1]:
import numpy as np
import pandas as pd
import math 
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
import copy

Problem2¶

In [2]:
aa=pd.DataFrame([[0,3,0,'Red'],[2,0,0,'Red'],[0,1,3,'Red'],[0,1,2,'Green'],[-1,0,1,'Green'],[1,1,2,'Red']],columns=['X1','X2','X3','Y'])
In [3]:
aa0=aa[['X1','X2','X3']]
In [4]:
x=[0,0,0]
In [5]:
[math.dist(aa0.iloc[i,], x) for i in range(len(aa))]
Out[5]:
[3.0,
 2.0,
 3.1622776601683795,
 2.23606797749979,
 1.4142135623730951,
 2.449489742783178]
In [6]:
aa.iloc[[1,3,4],3]
Out[6]:
1      Red
3    Green
4    Green
Name: Y, dtype: object

(a) Since there are more 'Green' labels in x=[0,0,0]'s neighbour, I think x=[0,0,0] should be labeled as 'Green'.

(b) If Bayes deci-sion boundary(K=3) in this problem is highly nonlinear, which means the boundary is wiggly, we need to decrease K. That's because, we need a more complex model, and if the K is so large, it might cause large bias. Also, since the dataset is larger, the variance caused by more complex model will reduce. Thus, based on the variance bias trade-off, we want to let K smaller.

Problem3¶

Data cleaning and processing¶

A breif review of data¶

In [7]:
df_train =pd.read_csv('diabetes_train.csv')
df_test =pd.read_csv('diabetes_test.csv')
In [8]:
df_train.describe()
Out[8]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 428.000000 428.000000 428.000000 428.000000 428.000000 428.000000 428.000000 428.000000 428.000000
mean 4.053738 124.752336 69.672897 20.072430 84.067757 32.549065 0.502308 34.329439 0.478972
std 3.538270 32.822486 19.135913 16.555687 124.157706 7.669440 0.347304 11.926841 0.500142
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 103.000000 64.000000 0.000000 0.000000 27.875000 0.253750 25.000000 0.000000
50% 3.000000 123.000000 72.000000 22.500000 0.000000 32.500000 0.402500 31.000000 0.000000
75% 7.000000 145.000000 80.000000 32.000000 130.000000 36.800000 0.675000 41.250000 1.000000
max 17.000000 199.000000 114.000000 99.000000 846.000000 59.400000 2.420000 81.000000 1.000000
In [9]:
df_train.isnull().sum()
Out[9]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [10]:
[sum(df_train.iloc[:,i]==0) for i in range(8)]
Out[10]:
[64, 3, 19, 139, 218, 6, 0, 0]
In [11]:
[sum(df_test.iloc[:,i]==0) for i in range(8)]
Out[11]:
[20, 1, 9, 32, 51, 1, 0, 0]
In [12]:
df_train.plot(kind='box',
    subplots=True, 
    sharey=False, # use different y scale
    figsize=(20, 5));

After using descriptive statistics and ploting boxplot, we found 'Glucose', 'BloodPressure','SkinThickness','Insulin' and 'BMI' has zero value, which does not make sense.Since zero is not reasonable in 'Glucose', 'BloodPressure','SkinThickness','Insulin' and 'BMI', thus we should treat them as nan value.

Turn unreasonable value into missing value¶

In [13]:
df_train['SkinThickness_updated']=df_train['SkinThickness'].replace(0,np.nan)
df_train['Insulin_updated']=df_train['Insulin'].replace(0,np.nan)
df_train['BMI_updated']=df_train['BMI'].replace(0,np.nan)
df_train['Glucose_updated']=df_train['Glucose'].replace(0,np.nan)
df_train['BloodPressure_updated']=df_train['BloodPressure'].replace(0,np.nan)
In [14]:
df_train=df_train.drop(['Glucose','BloodPressure','SkinThickness','Insulin','BMI'],axis=1)
In [15]:
df_test['SkinThickness_updated']=df_test['SkinThickness'].replace(0,np.nan)
df_test['Insulin_updated']=df_test['Insulin'].replace(0,np.nan)
df_test['BMI_updated']=df_test['BMI'].replace(0,np.nan)
df_test['Glucose_updated']=df_test['Glucose'].replace(0,np.nan)
df_test['BloodPressure_updated']=df_test['BloodPressure'].replace(0,np.nan)
In [16]:
df_test=df_test.drop(['Glucose','BloodPressure','SkinThickness','Insulin','BMI'],axis=1)

Missing data pattern¶

In [17]:
df_train.isnull().sum()
Out[17]:
Pregnancies                   0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
SkinThickness_updated       139
Insulin_updated             218
BMI_updated                   6
Glucose_updated               3
BloodPressure_updated        19
dtype: int64
In [18]:
df_test.isna().sum()
Out[18]:
Pregnancies                  0
DiabetesPedigreeFunction     0
Age                          0
Outcome                      0
SkinThickness_updated       32
Insulin_updated             51
BMI_updated                  1
Glucose_updated              1
BloodPressure_updated        9
dtype: int64
In [19]:
import missingno as msno
msno.matrix(df_train, figsize=(12,4));
In [20]:
msno.bar(df_train, figsize=(12,4));

From graph, we could conclude that 'Glucose', 'BloodPressure','SkinThickness','Insulin' and 'BMI' has missing value. And the situation is more serious in columns 'SkinThickness' and 'Insulin'. Thus, we need to implement data imputation to fill the missing value.

Missing data imputation¶

In [21]:
df_train.shape
Out[21]:
(428, 9)

Since our data amount is not large enough, we had better not use dropna method to process missing value

mean and median imputation.¶
In [22]:
df_train.plot(kind='box',
    subplots=True, 
    sharey=False, # use different y scale
    figsize=(20, 5));

From the graph, it is obvious that 'BloodPressure','SkinThickness','Insulin' and 'BMI' has outliers. Thus we choose to use median imputation for 'BloodPressure','SkinThickness','Insulin' and 'BMI'. Use mean imputation for 'Glucose'.

In [23]:
df_train_clean = copy.deepcopy(df_train)
df_test_clean = copy.deepcopy(df_test)

df_train_clean.iloc[:,4] = df_train_clean.iloc[:,4].fillna(df_train_clean.iloc[:,4].median())
df_test_clean.iloc[:,4] = df_test_clean.iloc[:,4].fillna(df_test_clean.iloc[:,4].median())

df_train_clean.iloc[:,5] = df_train_clean.iloc[:,5].fillna(df_train_clean.iloc[:,5].median())
df_test_clean.iloc[:,5] = df_test_clean.iloc[:,5].fillna(df_test_clean.iloc[:,5].median())

df_train_clean.iloc[:,6] = df_train_clean.iloc[:,6].fillna(df_train_clean.iloc[:,6].median())
df_test_clean.iloc[:,6] = df_test_clean.iloc[:,6].fillna(df_test_clean.iloc[:,6].median())

df_train_clean.iloc[:,7] = df_train_clean.iloc[:,7].fillna(df_train_clean.iloc[:,7].mean())
df_test_clean.iloc[:,7] = df_test_clean.iloc[:,7].fillna(df_test_clean.iloc[:,7].mean())

df_train_clean.iloc[:,8] = df_train_clean.iloc[:,8].fillna(df_train_clean.iloc[:,8].median())
df_test_clean.iloc[:,8] = df_test_clean.iloc[:,8].fillna(df_test_clean.iloc[:,8].median())
In [24]:
df_train_clean.isna().sum()
Out[24]:
Pregnancies                 0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
SkinThickness_updated       0
Insulin_updated             0
BMI_updated                 0
Glucose_updated             0
BloodPressure_updated       0
dtype: int64
In [25]:
df_test_clean.isna().sum()
Out[25]:
Pregnancies                 0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
SkinThickness_updated       0
Insulin_updated             0
BMI_updated                 0
Glucose_updated             0
BloodPressure_updated       0
dtype: int64

Now there is no missing value!! Our data is cleaned!!!

In [26]:
df_train_clean['Outcome'].value_counts()
Out[26]:
0    223
1    205
Name: Outcome, dtype: int64
In [27]:
df_test_clean['Outcome'].value_counts()
Out[27]:
1    63
0    45
Name: Outcome, dtype: int64

It could be concluded that the 'Outcome' structure in test and train data is similar. Thus, we could start to our EDA and KNN.

EDA¶

In [28]:
sns.pairplot(df_train_clean, diag_kind="kde", hue='Outcome', height=2);
In [29]:
df_train_clean.plot(kind='box',by='Outcome',
    subplots=True, 
    sharey=False, # use different y scale
    figsize=(20, 5));
In [30]:
import plotly.express as px
px.box(df_train_clean.melt(id_vars=['Outcome'], var_name = "col" ), x = "Outcome", y='value', 
       color = 'Outcome',facet_col='col').update_yaxes(matches=None)

We could notice the 'Glucose' with ('BloodPressure','SkinThickness','Insulin', 'BMI') pairplots seems obviously could identify the "Outcome". Also, the distribution of 'Glucose','BloodPressure','SkinThickness','Insulin', 'BMI','Age','DiabetesPedigreeFunction' columns are all slightly different depending on the different 'Outcome', which could be invesigated in depth.

Standarize our data¶

In [31]:
from sklearn.preprocessing import StandardScaler

# extract design matrix and target variable
X_train = df_train_clean.drop(['Outcome'], axis=1)
y_train = df_train_clean['Outcome']
X_test = df_test_clean.drop(['Outcome'], axis=1)
y_test = df_test_clean['Outcome'] 

# get predictors' names
col_names = X_train.columns

# initiate StandardScaler
scaler = StandardScaler()

# fit only on X_train
scaler.fit(X_train)

# scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [32]:
X_train = pd.DataFrame(X_train, columns=col_names)
X_test = pd.DataFrame(X_test, columns=col_names)
In [33]:
X_train.describe()
Out[33]:
Pregnancies DiabetesPedigreeFunction Age SkinThickness_updated Insulin_updated BMI_updated Glucose_updated BloodPressure_updated
count 4.280000e+02 4.280000e+02 4.280000e+02 4.280000e+02 4.280000e+02 4.280000e+02 4.280000e+02 4.280000e+02
mean 2.490220e-17 -6.640586e-17 -7.678178e-17 -1.400749e-16 -6.640586e-17 2.116687e-16 2.365709e-16 -4.233374e-16
std 1.001170e+00 1.001170e+00 1.001170e+00 1.001170e+00 1.001170e+00 1.001170e+00 1.001170e+00 1.001170e+00
min -1.147025e+00 -1.223149e+00 -1.118908e+00 -2.555907e+00 -1.506390e+00 -2.241627e+00 -2.627938e+00 -3.632269e+00
25% -8.640698e-01 -7.165163e-01 -7.831376e-01 -3.434050e-01 -2.025617e-01 -7.313489e-01 -6.964103e-01 -6.728765e-01
50% -2.981603e-01 -2.877165e-01 -2.794819e-01 2.067759e-02 -2.025617e-01 -6.894646e-02 -8.475997e-02 8.811008e-02
75% 8.336587e-01 4.978159e-01 5.809300e-01 2.447284e-01 -2.025617e-01 5.745302e-01 6.234667e-01 5.954345e-01
max 3.663206e+00 5.528106e+00 3.917649e+00 7.750431e+00 7.571310e+00 3.996312e+00 2.361841e+00 3.470273e+00

Conduct KNN¶

In [34]:
k_range = np.arange(1,31)
train_error = []
test_error = []

for k in k_range:
    # setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the model
    knn.fit(X_train, y_train)
    
    # train error rate
    pred_train = knn.predict(X_train)
    train_error.append(np.mean(pred_train != y_train))
    
    # test error rate
    pred_test = knn.predict(X_test)
    test_error.append(np.mean(pred_test != y_test))
In [35]:
plt.plot(k_range, train_error, label='Training Error')
plt.plot(k_range, test_error, label='Test Error')
plt.legend()
plt.title('Training and test error rate for KNN(mean and median imputation)')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()
In [36]:
test_error.index(min(test_error))
Out[36]:
2
In [37]:
min(test_error)
Out[37]:
0.21296296296296297

As a result, the best K value is 3, since when K=3, test_error will be minimized.

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
knn = KNeighborsClassifier(n_neighbors=3)

# fit the model on the training set
knn.fit(X_train, y_train);

pred_train = knn.predict(X_train)

# make predictions on test data
pred_test = knn.predict(X_test)

 
cm_train = confusion_matrix(y_train, pred_train)

cm_display = ConfusionMatrixDisplay(confusion_matrix = cm_train, display_labels = ['High', 'Low'])
cm_display.plot()
plt.show()

With the confusion matrix, we could see the predicted value is correct in most situation.

Implement for MICE imputation¶

In [39]:
df_train_clean0 = copy.deepcopy(df_train)
df_test_clean0 = copy.deepcopy(df_test)
In [40]:
X_train_mice = df_train_clean0.drop(['Outcome'], axis=1)
y_train_mice = df_train_clean0['Outcome']
X_test_mice = df_test_clean0.drop(['Outcome'], axis=1)
y_test_mice = df_test_clean0['Outcome'] 
In [41]:
col_names = X_train_mice.columns
In [42]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, random_state=42)
X_train_mice = imp.fit_transform(X_train_mice)
X_test_mice = imp.fit_transform(X_test_mice)
In [43]:
X_train_mice=pd.DataFrame(X_train_mice,columns=col_names)
X_test_mice=pd.DataFrame(X_test_mice,columns=col_names)
In [44]:
scaler1 = StandardScaler()

# fit only on X_train
scaler1.fit(X_train_mice)

# scale both X_train and X_test
X_train_mice = scaler.transform(X_train_mice)
X_test_mice = scaler.transform(X_test_mice)
In [45]:
X_train_mice = pd.DataFrame(X_train_mice, columns=col_names)
X_test_mice = pd.DataFrame(X_test_mice, columns=col_names)
In [46]:
k_range = np.arange(1,31)
train_error = []
test_error = []

for k in k_range:
    # setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the model
    knn.fit(X_train_mice, y_train)
    
    # train error rate
    pred_train = knn.predict(X_train_mice)
    train_error.append(np.mean(pred_train != y_train_mice))
    
    # test error rate
    pred_test = knn.predict(X_test_mice)
    test_error.append(np.mean(pred_test != y_test_mice))
In [47]:
plt.plot(k_range, train_error, label='Training Error')
plt.plot(k_range, test_error, label='Test Error')
plt.legend()
plt.title('Training and test error rate for KNN(MICE)')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()
In [48]:
test_error.index(min(test_error))
Out[48]:
2
In [49]:
min(test_error)
Out[49]:
0.25

For MICE, the best K value is still 3, since when K=3, test_error will be minimized.

In a conclusion, K=3 is the proper value for KNN model.